This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/. RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix. Additional comments about RRBS data:
This paper uses glmFit-based pipeline, whereas others recommend using glmQFit as it is more accurate for controlling type 1 error - see comments in https://support.bioconductor.org/p/76790/#:~:text=glmQLFit%20uses%20the%20trended%20NB,a%20GLM%20%2D%20and%20that’s%20it
# edgeR's tolerance parameter: default is 1e-06, but this seems to be too slow
edger_tol = 1e-05
# Specify the parameters required for the analysis
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir = "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd = "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir = "~/Desktop/MoTrPAC/data/pass_1b/rrbs/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path should
# also have a qa_qc directory with the qc_metrics and sample_metadata files.
# bucket = "gs://motrpac-release-data-staging/Results/pass1b-06/epigenomics/"
bucket = "gs://motrpac-external-release2-data-staging/Results/pass1b-06/epigenomics/"
# Specify bucket and local path for the phenotypic data
# this path is internal to BIC - faster
pheno_bucket = "gs://bic_data_analysis/pass1b/pheno_dmaqc/merged2021-03-22/"
pheno_local_dir = "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"
# we require a site to have a total count
# (both methylated and unmethylated) of at least min_count_coverage in
# at least min_per_samples * (#samples);
# specifying min_per_samples=1 means all samples
min_count_coverage = 8
min_per_samples = 1
# parameters for defining promoters:
# define region sizes around TSS per gene
map_to_promoters=T
promoter_coord = c(-1000,2000)
# Specify how many PCs to examine
num_pcs = 5
num_pcs_for_outlier_analysis = 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr = 0.0001
OUTLIER_IQR_THR = 5
# output path
data_out_bucket = "gs://mawg-data/pass1b-06/epigen-rrbs/data/"
dea_out_bucket = "gs://mawg-data/pass1b-06/epigen-rrbs/dea/"
# Define technical and biological variables to be considered in the QC
pipeline_qc_cols = c("reads","pct_Unaligned","pct_GC","pct_chrM")
biospec_cols = c("registration.sex","key.anirandgroup",
"registration.batchnumber",
"training.staffid",
"is_control",
"vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
"terminal.weight.mg","time_to_freeze",
"timepoint","bid","pid")
# load functions and libraries
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/unsupervised_normalization_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/gcp_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/qc_report_aux_functions.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/config_session.R"))
source(paste0(repo_local_dir,"motrpac-bic-norm-qc/tools/association_analysis_methods.R"))
Get the Rat annotations:
# manage the gene annotation
# # Install the rat db
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("org.Rn.eg.db")
library("org.Rn.eg.db")
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:dplyr':
##
## select
##
This flow is based on: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5747346/ Note that RRBS data are assumed to have two columns per sample, one with the “-Me” suffix and the other with the “-Un” suffix.
First, download the data to local dir (skip this if already downloaded, these are large data tables).
obj = download_bucket_to_local_dir(bucket,local_path=local_data_dir,
remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# Second, delete bismark files
bis_files = obj$downloaded_files[
grepl("bismark.cov.gz",obj$downloaded_files)
]
rem_obj = sapply(bis_files,file.remove)
Now load and filter the RRBS data:
obj = list(
downloaded_files = list.files(local_data_dir,full.names = T,recursive = T)
)
obj$downloaded_files = obj$downloaded_files[
grepl("epigen-rrbs",obj$downloaded_files)
]
rdata_files = obj$downloaded_files[
grepl(".RData$",ignore.case = T,obj$downloaded_files)
]
names(rdata_files) = sapply(rdata_files,GET_get_tissue_from_download_path)
tissues = names(rdata_files)
tissue2rrbs_data = list()
for (tissue in tissues){
curr_rdata_file = rdata_files[grepl(tissue,rdata_files)]
if(length(curr_rdata_file)!=1){
print(paste("Warning: more than a single RData file in tissue:",tissue))
}
yall = get(load(curr_rdata_file))
# Filtering unassembled chromosomes
keep <- rep(TRUE, nrow(yall))
Chr <- as.character(yall$genes$Chr)
keep[ grep("random",Chr) ] <- FALSE
keep[ grep("chrUn",Chr) ] <- FALSE
# remove non-chr ones
keep[!grepl("chr",Chr) ] <- FALSE
# remove M chromosome (otherwise we get error when assigning annotation below)
keep[Chr=="chrM"] <- FALSE
print(paste("Data from tissue",tissue,"was loaded into session"))
print(paste("how many sites were removed (unassembled+chrM)?",sum(!keep)))
# keep.lib.sizes=FALSE causes the library sizes to be recomputed
yall <- yall[keep,, keep.lib.sizes=FALSE]
# fix the levels and order by chromosome
# rat genome chromosomes:
ChrNames <- paste0("chr",c(1:20,"X","Y"))
#ChrNames <- paste0("chr",c(1:2))
yall$genes$Chr <- factor(yall$genes$Chr, levels=ChrNames)
o <- order(yall$genes$Chr, yall$genes$Locus)
yall <- yall[o,]
print(paste("Counts matrix dim before low counts filter:"))
print(dim(yall))
gc()
# get locations, gene ids, etc.
TSS <- nearestTSS(yall$genes$Chr, yall$genes$Locus, species="Rn")
yall$genes$EntrezID <- TSS$gene_id
yall$genes$Symbol <- TSS$symbol
yall$genes$Strand <- TSS$strand
yall$genes$Distance <- TSS$distance
yall$genes$Width <- TSS$width
#head(yall$genes)
if(map_to_promoters){
# Map data to promoters and sum over regions
InPromoter <- yall$genes$Distance >= promoter_coord[1] & yall$genes$Distance <= promoter_coord[2]
# We subset the CpGs to those contained in a promoter region:
yIP <- yall[InPromoter,,keep.lib.sizes=FALSE]
# Compute total counts
ypr <- rowsum(yIP, yIP$genes$EntrezID, reorder=FALSE)
yall = ypr
gc()
}
# Remove low counts (by promoter)
# The analysis needs to be restricted to CpG sites that have enough coverage for the
# methylation level to be measurable in a meaningful way at that site.
# As a conservative rule of thumb, we require a site to have a total count
# (both methylated and unmethylated) of at least 8 in every sample.
Coverage <- yall$counts[,grepl("-Me",colnames(yall$counts))] +
yall$counts[, grepl("-Un",colnames(yall$counts))]
gc()
# see description of min_count_coverage and min_per_samples above
keep <- rowSums(Coverage >= min_count_coverage) >= min_per_samples*ncol(Coverage)
# filter the data
y <- yall[keep,, keep.lib.sizes=FALSE]
print(paste("Counts matrix dim after low counts filter:"))
print(dim(y))
# Data normalization
# A key difference between BS-seq and other sequencing data is that the pair of libraries
# holding the methylated and unmethylated reads for a particular sample are treated as a unit.
# To ensure that the methylated and unmethylated reads for the same sample are treated on the
# same scale, we need to set the library sizes to be equal for each pair of libraries.
# We set the library sizes for each sample to be the average of the total read counts for
# the methylated and unmethylated libraries.
# Other normalization methods developed for RNA-seq data, such as TMM, are not required for BS-seq data.
TotalLibSize <- y$samples$lib.size[grepl("-Me",colnames(yall$counts))] +
+ y$samples$lib.size[grepl("-Un",colnames(yall$counts))]
y$samples$lib.size <- rep(TotalLibSize, each=2)
tissue2rrbs_data[[tissue]] = y
# rm(yall);rm(y);gc()
}
## [1] "Data from tissue t52-hippocampus was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72675"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6538607 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12244 104
## [1] "Data from tissue t55-gastrocnemius was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74600"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6869469 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12387 104
## [1] "Data from tissue t58-heart was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 72607"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6597238 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12376 104
## [1] "Data from tissue t59-kidney was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 73240"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6638813 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12329 104
## [1] "Data from tissue t66-lung was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 70433"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6437815 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12302 104
## [1] "Data from tissue t68-liver was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 71495"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6499281 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12304 104
## [1] "Data from tissue t69-brown-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 78814"
## [1] "Counts matrix dim before low counts filter:"
## [1] 7385104 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12443 104
## [1] "Data from tissue t70-white-adipose was loaded into session"
## [1] "how many sites were removed (unassembled+chrM)? 74742"
## [1] "Counts matrix dim before low counts filter:"
## [1] 6962440 104
## [1] "Counts matrix dim after low counts filter:"
## [1] 12357 104
# Get the pipeline qc scores
pipeline_scores_file = obj$downloaded_files[
grepl("qa-qc-metrics",obj$downloaded_files) &
grepl(".csv$",obj$downloaded_files)
]
if(length(pipeline_scores_file)!=1){
print("Error: RRBS data does not have a single pipeline scores file")
}
pipelineqc_scores = read.csv(pipeline_scores_file)
rownames(pipelineqc_scores) = as.character(pipelineqc_scores$vial_label)
Add the phenotypic data
#Add the phenotypic data
pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
remove_prev_files = F,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convinience)
pheno_data$viallabel_data$tissue =
pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time to freeze variable ((for convinience))
pheno_data$viallabel_data$time_to_freeze =
pheno_data$viallabel_data$calculated.variables.frozetime_after_train -
pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# add a binary is_control variable
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
v = rep(0,length(x))
tps = c("Eight"=8,"Four"=4,"One"=1,"Two"=2)
for(tp in names(tps)){
v[grepl(paste0(tp,"-week"),x,perl = T,ignore.case = T)] = tps[tp]
}
return(v)
}
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
pheno_data$viallabel_data$key.anirandgroup
)
sample_covs = pheno_data$viallabel_data[,biospec_cols]
We transform the normalized data into M values and quantile normalize the resulting matrix. These can be interpreted as the base2 logit transformation of the proportion of methylated signal at each locus.
tissue2Mmatrix = list()
for(tissue in names(tissue2rrbs_data)){
y = tissue2rrbs_data[[tissue]]
Me <- y$counts[, grepl("-Me",colnames(y$counts))]
Un <- y$counts[, grepl("-Un",colnames(y$counts))]
# The M-value can be interpreted as the base2 logit transformation of the
# proportion of methylated signal at each locus:
M <- log2(Me + 2) - log2(Un + 2)
colnames(M) <- gsub("-Me","",colnames(M))
# rownames(M) == y$genes$EntrezID # sanity check
# MDS plot
# In this plot the distance between each pair of samples represents the average logit
# change between the samples for the top most differentially methylated
# CpG loci between that pair of samples.
sex = sample_covs[colnames(M),1]
sex[sex==1] = "M";sex[sex==2] = "F"
cols = rep("blue",length(sex))
cols[sex=="M"] = "green"
plotMDS(M,labels = sex,pch = sample_covs[colnames(M),1]+20,col=cols,
main = paste("MDS plot before quantile",tissue))
samp = sample(1:ncol(M))[1:20]
samp2 = sample(1:nrow(M))[1:min(20000,nrow(M))]
boxplot(M[samp2,samp],
main=paste0(tissue,"\nbefore quantile (",nrow(M)," promoters)"),
ylab = "log2 M values",xaxt="n")
M = run_quantile_normalization(M)
boxplot(M[samp2,samp],
main=paste0(tissue,"\nafter quantile (",nrow(M)," promoters)"),
ylab = "log2 M values",xaxt="n")
tissue2Mmatrix[[tissue]] = M
plotMDS(M,labels = sex,pch = sample_covs[colnames(M),1]+20,col=cols,
main = paste("MDS plot after quantile",tissue))
}
The RRBS pipeline manual of procedures (MOP) defines a flagged sample (e.g., potentially problematic) as a sample that satisfies one of the following: (1) number of read pairs <20M, (2) Low number of uniquely mapped reads: pct_Uniq <50%, (3) mapped read count (pct_Uniq*reads) < 50% of the median mapped read count of all samples within a tissue (within a sequencing batch), (4) bisulfite conversion efficiency less than 95%, i.e lambda_pct_CpG>5%, but limited to samples with lambda_pct_Uniq>0.5%, (5)unexpected strands mapped: pct_OT<30% or pct_OB>70%; pct_CTOT>10% or pct_CTOB>10%, and (6) high duplications based on UMI: pct_umi_dup >20%.
pipelineqc_scores$pipeline_flags = rrbs_pipeline_flagged_samples(pipelineqc_scores)
mop_flagged_samples = rownames(pipelineqc_scores)[nchar(pipelineqc_scores$pipeline_flags)>0]
We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.
tissue2pca = list()
for(tissue in names(tissue2Mmatrix)){
curr_data = tissue2Mmatrix[[tissue]]
# remove vial labels that start with 8 (reference samples)
curr_data = curr_data[,grepl("^9",colnames(curr_data),perl = T)]
# remove zero variance rows
curr_data = curr_data[apply(curr_data,1,sd)>0,]
curr_pca = prcomp(t(curr_data),scale. = T,center = T,rank. = num_pcs)
curr_pcax = curr_pca$x[,1:num_pcs]
explained_var = summary(curr_pca)[["importance"]][3,5]
# plot
df = data.frame(curr_pcax,
randgroup = pheno_data$viallabel_data[rownames(curr_pcax),"key.anirandgroup"],
sex = as.character(pheno_data$viallabel_data[rownames(curr_pcax),"registration.sex"]),
stringsAsFactors = F
)
df$sex[df$sex=="1"] = "F"
df$sex[df$sex=="2"] = "M"
p = ggplot(df) +
geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
ggtitle(tissue)
plot(p)
tissue2pca[[tissue]] = df
}
Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance.
For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.
pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(currname in names(tissue2Mmatrix)){
curr_pcax = tissue2pca[[currname]]
curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
# get the metadata of the samples
curr_meta = cbind(pipelineqc_scores[rownames(curr_pcax),pipeline_qc_cols],
pheno_data$viallabel_data[rownames(curr_pcax),biospec_cols])
# remove metadata variables with NAs
curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
# remove bid and pid
curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
# remove the text group description
curr_meta = curr_meta[!grepl("randgr",names(curr_meta))]
# remove fields withe zero variance
curr_meta = curr_meta[,apply(curr_meta,2,sd)>0]
# Assumption here: all metadata values are numeric
corrs = cor(curr_pcax,curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_pcax[rownames(curr_meta),],
curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
ggtitle(currname) +
theme(plot.title = element_text(hjust = 0.5,size=20)))
for(i in 1:nrow(corrsp)){
for(j in 1:ncol(corrsp)){
if(corrsp[i,j]>p_thr){next}
pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
c(currname,
rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
)
}
}
colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
"qc_metric","rho(spearman)","p-value")
####
# compute correlations among the metadata variables
####
corrs = cor(curr_meta,method="spearman")
corrsp = pairwise_eval(
curr_meta,func=pairwise_association_wrapper,
f=1)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(corrs,lab=T,lab_size=1.5,hc.order = T))
for(n1 in rownames(corrsp)){
for(n2 in rownames(corrsp)){
if(n1==n2){break}
if(n1 %in% biospec_cols &&
n2 %in% biospec_cols) {next}
if(corrsp[n1,n2]>p_thr){next}
metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
c(currname,n1,n2,corrs[n1,n2],corrsp[n1,n2])
)
}
}
if(!is.null(dim(metadata_variable_assoc_report))){
colnames(metadata_variable_assoc_report) = c(
"Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
}
}
# Format the reports - for a nicer presentation in a table
pcs_vs_qc_var_report[,5] = format(
as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
metadata_variable_assoc_report[,5] = format(
as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
as.numeric(metadata_variable_assoc_report[,4]),digits=3)
kable(pcs_vs_qc_var_report,longtable=T,
caption = "Correlations between PCs and other variables") %>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | PC | qc_metric | rho(spearman) | p-value |
|---|---|---|---|---|
| t52-hippocampus | PC1 | reads | -0.808 | 4.40e-08 |
| t52-hippocampus | PC1 | pct_chrM | 0.696 | 1.19e-07 |
| t52-hippocampus | PC1 | registration.sex | -0.866 | 2.50e-40 |
| t52-hippocampus | PC1 | vo2.max.test.vo2_max_visit1 | 0.455 | 1.45e-05 |
| t52-hippocampus | PC1 | terminal.weight.mg | -0.746 | 2.61e-23 |
| t55-gastrocnemius | PC1 | registration.sex | 0.866 | 2.24e-56 |
| t55-gastrocnemius | PC1 | vo2.max.test.vo2_max_visit1 | -0.607 | 2.31e-06 |
| t55-gastrocnemius | PC1 | terminal.weight.mg | 0.680 | 5.58e-24 |
| t55-gastrocnemius | PC2 | pct_Unaligned | 0.600 | 2.00e-07 |
| t55-gastrocnemius | PC2 | pct_GC | -0.812 | 4.68e-10 |
| t58-heart | PC1 | pct_chrM | -0.434 | 2.44e-05 |
| t58-heart | PC1 | registration.sex | -0.866 | 4.69e-60 |
| t58-heart | PC1 | vo2.max.test.vo2_max_visit1 | 0.573 | 3.08e-06 |
| t58-heart | PC1 | terminal.weight.mg | -0.722 | 1.98e-24 |
| t58-heart | PC2 | pct_Unaligned | 0.711 | 3.66e-10 |
| t58-heart | PC2 | pct_GC | -0.597 | 1.88e-06 |
| t59-kidney | PC1 | registration.sex | -0.866 | 8.44e-34 |
| t59-kidney | PC1 | vo2.max.test.vo2_max_visit1 | 0.549 | 5.24e-06 |
| t59-kidney | PC1 | terminal.weight.mg | -0.743 | 1.20e-24 |
| t59-kidney | PC2 | reads | -0.904 | 8.64e-15 |
| t66-lung | PC1 | pct_chrM | 0.534 | 1.85e-06 |
| t66-lung | PC1 | registration.sex | -0.866 | 4.53e-56 |
| t66-lung | PC1 | vo2.max.test.vo2_max_visit1 | 0.520 | 2.75e-06 |
| t66-lung | PC1 | terminal.weight.mg | -0.768 | 2.21e-25 |
| t66-lung | PC2 | pct_Unaligned | -0.509 | 9.27e-05 |
| t66-lung | PC2 | pct_GC | 0.623 | 2.57e-05 |
| t66-lung | PC4 | reads | -0.420 | 2.64e-05 |
| t68-liver | PC1 | pct_chrM | 0.687 | 3.60e-08 |
| t68-liver | PC1 | registration.sex | -0.866 | 2.50e-58 |
| t68-liver | PC1 | vo2.max.test.vo2_max_visit1 | 0.634 | 1.23e-06 |
| t68-liver | PC1 | terminal.weight.mg | -0.824 | 5.03e-26 |
| t68-liver | PC2 | pct_Unaligned | -0.655 | 7.17e-08 |
| t68-liver | PC2 | pct_GC | 0.497 | 7.13e-06 |
| t69-brown-adipose | PC1 | registration.sex | -0.866 | 1.65e-11 |
| t69-brown-adipose | PC1 | vo2.max.test.vo2_max_visit1 | 0.622 | 1.51e-05 |
| t69-brown-adipose | PC1 | terminal.weight.mg | -0.746 | 3.86e-09 |
| t69-brown-adipose | PC2 | pct_chrM | -0.380 | 9.21e-06 |
| t69-brown-adipose | PC2 | registration.sex | 0.766 | 1.85e-06 |
| t69-brown-adipose | PC2 | terminal.weight.mg | 0.697 | 1.85e-06 |
| t69-brown-adipose | PC5 | reads | 0.855 | 2.76e-07 |
| t70-white-adipose | PC1 | pct_chrM | -0.855 | 4.35e-17 |
| t70-white-adipose | PC1 | registration.sex | -0.866 | 1.53e-19 |
| t70-white-adipose | PC1 | training.staffid | 0.495 | 3.37e-05 |
| t70-white-adipose | PC1 | vo2.max.test.vo2_max_visit1 | 0.646 | 1.76e-06 |
| t70-white-adipose | PC1 | terminal.weight.mg | -0.854 | 3.96e-18 |
| t70-white-adipose | PC4 | pct_Unaligned | -0.387 | 1.50e-06 |
kable(metadata_variable_assoc_report,longtable=T,
caption = "Correlations between metadata variables")%>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | Var1 | Var2 | rho(spearman) | p-value |
|---|---|---|---|---|
| t52-hippocampus | pct_chrM | reads | -0.643 | 3.68e-09 |
| t52-hippocampus | registration.sex | reads | 0.528 | 1.23e-05 |
| t52-hippocampus | registration.sex | pct_chrM | -0.611 | 2.48e-06 |
| t52-hippocampus | terminal.weight.mg | reads | 0.472 | 4.43e-05 |
| t52-hippocampus | terminal.weight.mg | pct_chrM | -0.594 | 5.81e-07 |
| t55-gastrocnemius | pct_GC | pct_Unaligned | -0.493 | 2.75e-05 |
| t58-heart | registration.sex | pct_chrM | 0.566 | 1.84e-05 |
| t58-heart | terminal.weight.mg | pct_chrM | 0.531 | 1.56e-05 |
| t66-lung | registration.sex | pct_chrM | -0.677 | 1.43e-06 |
| t66-lung | terminal.weight.mg | pct_chrM | -0.612 | 2.46e-06 |
| t68-liver | registration.sex | pct_chrM | -0.711 | 6.42e-08 |
| t68-liver | terminal.weight.mg | pct_chrM | -0.640 | 7.11e-08 |
| t70-white-adipose | pct_GC | pct_Unaligned | -0.485 | 1.04e-05 |
| t70-white-adipose | registration.sex | pct_chrM | 0.856 | 1.34e-15 |
| t70-white-adipose | vo2.max.test.vo2_max_visit1 | pct_chrM | -0.530 | 3.93e-06 |
| t70-white-adipose | terminal.weight.mg | pct_chrM | 0.779 | 3.16e-17 |
In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quantile range away from the boxplot. Samples outside this range are then flagged.
Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples.
pca_outliers_report = c()
for(currname in names(tissue2pca)){
curr_pcax = tissue2pca[[currname]]
curr_pcax = curr_pcax[,grepl("^PC",colnames(curr_pcax))]
# Univariate: use IQRs
pca_outliers = c()
for(j in 1:ncol(curr_pcax)){
outlier_values <- boxplot.stats(curr_pcax[,j],coef = OUTLIER_IQR_THR)$out
for(outlier in names(outlier_values)){
pca_outliers_report = rbind(pca_outliers_report,
c(currname,paste("PC",j,sep=""),outlier,
format(outlier_values[outlier],digits=5))
)
if(!is.element(outlier,names(pca_outliers))){
pca_outliers[outlier] = outlier_values[outlier]
}
}
}
# Plot the outliers
if(length(pca_outliers)>0){
# print(length(kNN_outliers))
df = data.frame(curr_pcax,
outliers = rownames(curr_pcax) %in% names(pca_outliers))
col = rep("black",nrow(df))
col[df$outliers] = "green"
plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"flagged outliers"),
xlab = "PC1",ylab="PC2")
plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(currname,"flagged outliers"),
xlab = "PC3",ylab="PC4")
}
}
if(!is.null(dim(pca_outliers_report))){
colnames(pca_outliers_report) = c("dataset","PC","sample","score")
}
We verify that the sex of a sample can be predicted from the percent of chromosome Y and percent of chromosome X reads. This is done automatically by training a logistic regression classifier, but we also plot the 2D plot for each tissue.
# sex checks - using simple logistic regression classifier (LOO-CV)
sex_read_info = cbind(pipelineqc_scores$pct_chrX,pipelineqc_scores$pct_chrY)
rownames(sex_read_info) = rownames(pipelineqc_scores)
sex_check_outliers = c()
for(currname in names(tissue2Mmatrix)){
# 1 is female
pca_df = tissue2pca[[currname]]
read_info = sex_read_info[rownames(pca_df),]
df = data.frame(sex=as.factor(pca_df$sex),
pct_chrX = read_info[,1],
pct_chrY = read_info[,2],stringsAsFactors = F)
p = ggplot(df) +
geom_point(aes(x=pct_chrX, y=pct_chrY,col=sex, shape=sex)) +
ggtitle(paste0(currname," - sex check"))
plot(p)
train_control <- trainControl(method = "cv", number = nrow(df),
savePredictions = TRUE)
# train the model on training set
model <- train(sex ~ .,data = df,
trControl = train_control,
method = "glm", family=binomial(link="logit"))
# CV redults
cv_res = model$pred
cv_errors = cv_res[,1] != cv_res[,2]
err_samples = rownames(df)[cv_res[cv_errors,"rowIndex"]]
for(samp in err_samples){
sex_check_outliers = rbind(sex_check_outliers,
c(currname,samp))
}
}
if(! is.null(dim(sex_check_outliers))){
colnames(sex_check_outliers) = c("dataset","sample")
}
# add the dataset name to mop outliers
sample2dataset = paste(tolower(pipelineqc_scores[,"Tissue"]),
tolower(pipelineqc_scores[,"GET_site"]),sep=",")
names(sample2dataset) = rownames(pipelineqc_scores)
all_flagged = NULL
if(length(pca_outliers_report)>0){
all_flagged = union(all_flagged,pca_outliers_report[,"sample"])
}
if(length(sex_check_outliers)>0){
all_flagged = union(all_flagged,sex_check_outliers[,"sample"])
}
all_flagged = union(all_flagged,mop_flagged_samples)
flagged_sample_report = c()
for(samp in all_flagged){
samp_dataset = sample2dataset[samp]
samp_metric = NULL
if(samp %in% mop_flagged_samples){
samp_metric = pipelineqc_scores[samp,"pipeline_flags"]
}
if(!is.null(dim(sex_check_outliers)) &&
samp %in% sex_check_outliers[,"sample"]){
samp_metric = c(samp_metric,"Sex-flagged")
}
if(!is.null(dim(pca_outliers_report)) &&
samp %in% pca_outliers_report[,"sample"]){
curr_pcs = pca_outliers_report[pca_outliers_report[,"sample"]==samp,"PC"]
samp_metric = c(samp_metric,curr_pcs)
}
flagged_sample_report = rbind(flagged_sample_report,
c(samp,samp_dataset,paste(samp_metric,collapse=","))
)
}
colnames(flagged_sample_report) = c("Viallabel","Dataset","Methods")
kable(flagged_sample_report,longtable=T,
caption = "Outliers detected, MOP, Sex check, and by tissue PCA data")%>%
kable_styling(font_size = 8,latex_options = c("hold_position", "repeat_header"))
| Viallabel | Dataset | Methods |
|---|---|---|
| 90232017004 | white_adipose,mssm | NumReads<20M |
| 90217015903 | kidney,mssm | NumReads<20M |
| 90223015903 | kidney,mssm | NumReads<20M |
| 90227015903 | kidney,mssm | NumReads<20M |
| 90248015903 | kidney,mssm | NumReads<20M |
| 90254015903 | kidney,mssm | NumReads<20M |
| 90283015903 | kidney,mssm | NumReads<20M |
| 90239015203 | hippocampus,mssm | NumReads<20M |
| 90254015203 | hippocampus,mssm | NumReads<20M |
| 90259015203 | hippocampus,mssm | NumReads<20M |
| 90251016604 | lung,mssm | NumReads<20M |
# # use the raw bucket names as the output location
# # (implied assumption: rna_seq_data and norm_rnaseq_data have the same order)
# get_out_bucket_from_local_file_name<-function(b,input_bucket,shift_size=3){
# arr = strsplit(b,split="/")[[1]]
# n = length(arr)
# local_b = paste(arr[(n-shift_size):(n-1)],collapse="/")
# out_bucket = paste0(input_bucket,"/",local_b,"/")
# out_bucket = gsub("//","/",out_bucket)
# out_bucket = gsub("gs:/+","gs://",out_bucket)
# return(out_bucket)
# }
# out_buckets = sapply(rdata_files,
# get_out_bucket_from_local_file_name,input_bucket=bucket)
for(tissue in names(tissue2Mmatrix)){
# out_b = out_buckets[tissue]
x = tissue2Mmatrix[[tissue]]
x_bids = pheno_data$viallabel_data[colnames(x),"bid"]
x_pids = pheno_data$viallabel_data[colnames(x),"pid"]
newx = cbind(rownames(x),x)
rownames(newx) = 1:nrow(x)
for(j in 2:ncol(newx)){
newx[,j] = round(as.numeric(newx[,j]),digits=5)
}
newx = rbind(c("bid",x_bids),newx)
newx = rbind(c("pid",x_pids),newx)
newx = rbind(c("viallabel",colnames(x)),newx)
colnames(newx) = NULL
# a simple sanity check
m = newx[-c(1:3),-1]
mode(m) = "numeric"
if(!all(abs(m-x)< 1e-05)){
print("Error in parsing the matrix, breaking")
break
}
# Add annotation
y = tissue2rrbs_data[[tissue]]
x_annot = y$genes
x_annot = sapply(x_annot,as.character)
if(!all(x_annot[,"EntrezID"] == rownames(x))){
print(paste("Error: order of M matrix does not fit the annotation data in tissue:",tissue))
}
x_annot = rbind(colnames(x_annot),x_annot)
# save the output to the target bucket
local_fname = paste0(local_data_dir,
"motrpac_pass1b-06_",tissue,"_epigen-rrbs_",
"normalized-log-M-promoter.txt")
fwrite(newx,file=local_fname,sep="\t",quote=F,
row.names = F,col.names = F)
cmd = paste(gsutil_cmd,"cp",local_fname,data_out_bucket)
system(cmd)
local_fname2 = paste0(local_data_dir,
"motrpac_pass1b-06_",tissue,"_epigen-rrbs_",
"normalized-data-feature-annot.txt")
fwrite(x_annot,file=local_fname2,sep="\t",quote=F,
row.names = F,col.names = F)
cmd = paste(gsutil_cmd,"cp",local_fname2,data_out_bucket)
system(cmd)
}
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
## x being coerced from class: matrix to data.table
ftest_res = c() # keeps all ftest results
sex_ftest_res = c() # keeps the sex-specific f-tests
tpDA = c() # keep the timewise results
for(dataset_name in names(tissue2rrbs_data)){
print(paste("Analyzing dataset:",dataset_name))
y = tissue2rrbs_data[[dataset_name]]
is_sample = grepl("^9",colnames(y),perl=T)
y = y[,is_sample]
curr_samps = sapply(colnames(y),function(x)strsplit(x,split="-")[[1]][1])
# Parse the covariates
covs = data.frame(
sample = as.factor(curr_samps),
Me = grepl("Me",colnames(y)),
sex = pheno_data$viallabel_data[curr_samps,"registration.sex"],
timepoint = pheno_data$viallabel_data[curr_samps,"timepoint"],
is_control = pheno_data$viallabel_data[curr_samps,"is_control"]
)
covs$sex[covs$sex=="2"] = "M";covs$sex[covs$sex=="1"]="F"
rownames(covs) = colnames(y)
# Create design matrices for time-wise and the F tests
covs$group_sex_tp=factor(paste(covs$sex,covs$timepoint,covs$is_control,sep="_"))
covs$group_tp=factor(paste(covs$timepoint,covs$is_control,sep="_"))
samples_mat = model.matrix(~0+sample,data=covs)
sex_tp_mat = model.matrix(~0+group_sex_tp,data=covs)
tp_mat = model.matrix(~0+sex+group_tp+sex:group_tp,data=covs)
sex_tp_mat[!covs$Me,] = 0
tp_mat[!covs$Me,] = 0
if(rankMatrix(cbind(sex_tp_mat,tp_mat))[1] != ncol(sex_tp_mat)){
print(paste("Warning in",dataset_name,
"check the design matrices, the two eq matrices have different ranks"))
}
des = cbind(samples_mat,sex_tp_mat)
colnames(des) = gsub("group_sex_tp","",colnames(des))
# A design matrix for the more general F tests
full_model_str = "~0+sample+sex+group+sex:group"
null_model_str = "~0+sample+sex"
# (see https://support.bioconductor.org/p/12119/ and the limma guide)
des2 = cbind(samples_mat,tp_mat)
# define the contrasts for the analyses below
C_ttests = makeContrasts(
M_1_0 - M_8_1,M_2_0 - M_8_1,M_4_0 - M_8_1,M_8_0 - M_8_1,
F_1_0 - F_8_1,F_2_0 - F_8_1,F_4_0 - F_8_1,F_8_0 - F_8_1,
levels = des
)
print("Estimating dispersion for the first design matrix...")
y1 <- estimateDisp(y, design=des,tol = edger_tol)
print("Done")
print("Running glmQLFit...")
fit.ttest <- glmQLFit(y1,des)
print("Done")
# extract contrast info
print("Extracting timewise results from the contrasts...")
for(col in colnames(C_ttests)){
curr_sex = strsplit(col,split="_")[[1]][1]
sex_str = "male"
if(curr_sex=="F"){sex_str="female"}
curr_tp = strsplit(col,split="_")[[1]][2]
print(paste("Fitting timewise model for:",col))
res <- glmQLFTest(fit.ttest,contrast=C_ttests[,col])
plotMD(res)
# extract the results into a table
edger_res <- topTags(res, n=Inf, p.value = 1,
adjust.method = "BH",sort.by = "none")$table
# add z-scores
edger_res$F[edger_res$F < 0] = 1e-10
t.stat <- sign(edger_res$logFC) * sqrt(edger_res$F)
z <- zscoreT(t.stat, df=res$df.total)
curr_res = data.frame(
feature_ID = rownames(y),
edger_res[,1:4],
assay = "epigen-rrbs",
tissue = dataset_name,
sex = sex_str,
#logFC_se = TBD,
logFC = edger_res$logFC,
fscore = edger_res$F,
zscore = z,
covariates = NA,
comparison_group = paste0(curr_tp,"w"),
p_value = edger_res$PValue,
adj_p_value = edger_res$FDR
)
# add the results
tpDA = rbind(tpDA,curr_res)
}
print("Done")
print("Estimating dispersion for the second design matrix...")
y2 <- estimateDisp(y, design=des2,tol = edger_tol)
print("Done")
if(any(abs(y1$trended.dispersion-y2$trended.dispersion)>1e-05) ||
any(abs(y1$tagwise.dispersion-y2$tagwise.dispersion)>1e-05)){
print(paste("Warning in",dataset_name,
"dispersion estimates are inconsistent, please check"))
}
print("Fitting the model for the F-tests...")
fit.ftest <- glmQLFit(y2,des2)
is_group_variable = grepl("group",colnames(des2))
res <- glmQLFTest(fit.ftest,coef=colnames(des2)[is_group_variable])
ftest_edger_res <- topTags(res, n=Inf, p.value = 1,
adjust.method = "BH",sort.by = "none")$table
print("Done")
# add the results
curr_f_test_res = data.frame(
feature_ID = rownames(y),
ftest_edger_res[,1:4],
assay = "epigen-rrbs",
tissue = dataset_name,
p_value = ftest_edger_res$PValue,
adj_p_value = ftest_edger_res$FDR,
fscore = ftest_edger_res$F,
full_model = full_model_str,
reduced_model = null_model_str
)
ftest_res = rbind(ftest_res,curr_f_test_res)
#print(table(ftest_res$adj_p_value < 0.1))
print("Fitting model for the sex-assoc tests...")
is_interaction_variable = grepl("group",colnames(des2)) & grepl("sex",colnames(des2))
res <- glmQLFTest(fit.ftest,coef=colnames(des2)[is_interaction_variable])
sex_edger_res <- topTags(res, n=Inf, p.value = 1,
adjust.method = "BH",sort.by = "none")$table
print("Done")
# add the results
curr_sex_f_test_res = data.frame(
feature_ID = rownames(y),
sex_edger_res[,1:4],
assay = "epigen-rrbs",
tissue = dataset_name,
p_value = sex_edger_res$PValue,
adj_p_value = sex_edger_res$FDR,
fscore = sex_edger_res$F,
full_model = full_model_str,
reduced_model = "~1+sample+sex+group"
)
sex_ftest_res = rbind(sex_ftest_res,curr_sex_f_test_res)
# free space, erase objects
rm(edger_res);rm(sex_edger_res);rm(ftest_edger_res)
rm(des);rm(des2)
rm(y);rm(y1);rm(y2)
gc()
}
## [1] "Analyzing dataset: t52-hippocampus"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t55-gastrocnemius"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t58-heart"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t59-kidney"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t66-lung"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t68-liver"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t69-brown-adipose"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
## [1] "Analyzing dataset: t70-white-adipose"
## [1] "Estimating dispersion for the first design matrix..."
## [1] "Done"
## [1] "Running glmQLFit..."
## [1] "Done"
## [1] "Extracting timewise results from the contrasts..."
## [1] "Fitting timewise model for: M_1_0 - M_8_1"
## [1] "Fitting timewise model for: M_2_0 - M_8_1"
## [1] "Fitting timewise model for: M_4_0 - M_8_1"
## [1] "Fitting timewise model for: M_8_0 - M_8_1"
## [1] "Fitting timewise model for: F_1_0 - F_8_1"
## [1] "Fitting timewise model for: F_2_0 - F_8_1"
## [1] "Fitting timewise model for: F_4_0 - F_8_1"
## [1] "Fitting timewise model for: F_8_0 - F_8_1"
## [1] "Done"
## [1] "Estimating dispersion for the second design matrix..."
## [1] "Done"
## [1] "Fitting the model for the F-tests..."
## [1] "Done"
## [1] "Fitting model for the sex-assoc tests..."
## [1] "Done"
Print the results to files:
out_bucket = dea_out_bucket
suffix = "20210503.txt"
# timewise tables
for(tissue in unique(tpDA$tissue)){
currDA = tpDA[tpDA$tissue == tissue,]
local_fname = paste0(local_data_dir,
"pass1b-06_",tissue,"_epigen-rrbs_timewise-dea_",suffix)
write.table(currDA,file=local_fname,sep="\t",quote=F,
row.names = F,col.names = T)
cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
system(cmd)
system(paste("rm",local_fname))
}
# f-test tables
for(tissue in unique(ftest_res$tissue)){
currTable = ftest_res[ftest_res$tissue == tissue,]
local_fname = paste0(local_data_dir,
"pass1b-06_",tissue,"_epigen-rrbs_training-dea_",suffix)
write.table(currTable,file=local_fname,sep="\t",quote=F,
row.names = F,col.names = T)
cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
system(cmd)
system(paste("rm",local_fname))
}
# sex-based results
for(tissue in unique(sex_ftest_res$tissue)){
currTable = sex_ftest_res[sex_ftest_res$tissue == tissue,]
local_fname = paste0(local_data_dir,
"pass1b-06_",tissue,"_epigen-rrbs_training-sex-biased_",suffix)
write.table(currTable,file=local_fname,sep="\t",quote=F,
row.names = F,col.names = T)
cmd = paste(gsutil_cmd,"-m cp",local_fname,out_bucket)
system(cmd)
system(paste("rm",local_fname))
}